################################################
### Loading the support packages needed to   ###
### run the R code                           ###
################################################

## Install only the packages that are missing, so that re-running the
## script does not download and reinstall them every time.
required.pkgs <- c("tree", "ISLR")
pkg.available <- vapply(required.pkgs, requireNamespace, logical(1),
                        quietly = TRUE)
if (!all(pkg.available)) {
  install.packages(required.pkgs[!pkg.available])
}

library(tree)
library(ISLR)

################################################
### "Carseats" is a data set shipped with    ###
### the ISLR package: 400 observations about ###
### the sales of child car seats, with 11    ###
### variables or characteristics.            ###
### Source: rdrr.io/cran/ISLR/Carseats.html  ###
################################################

## NOTE: the data frame is deliberately NOT attach()ed. Columns are
## referenced explicitly (Carseats$Sales) so the script never reads a
## stale attached copy after Carseats is modified below.

### Checking the data set ###
dim(Carseats)   # number of rows & columns
head(Carseats)  # first 6 lines of the data
tail(Carseats)  # last 6 lines of the data

###################################################
### Analysis objective: what factors contribute ###
### to the increased sales of child car seats.  ###
### Use a classification tree to predict high   ###
### sales of car seats.                         ###
###################################################

### Creating a new variable to indicate 'high' & 'low' sales based on
### the existing variable 'Sales' (recorded in thousands of units).
### Sales of at most 8 (i.e. 8000 units) are labelled "No" (low);
### anything above 8 is labelled "Yes" (high).
High <- factor(ifelse(Carseats$Sales <= 8, "No", "Yes"))

### Add the new variable 'High' to the Carseats data ###
## Assigning the column by name is idempotent: re-running the script in
## the same session overwrites 'High' instead of appending a duplicate
## 'High.1' column, which the model formula below would otherwise pick
## up as a predictor.
Carseats$High <- High

### Check to make sure everything worked ###
dim(Carseats)
head(Carseats)
tail(Carseats)

### Fitting a classification tree on the Carseats data ###
### The data are split randomly into two parts: a training set, used
### to develop the classification tree, and a test set, used to verify
### the predictions.

## Fixing the seed makes the random split reproducible (for a given R
## version and RNG configuration).
set.seed(123)

## Training set: 200 randomly selected row indices of Carseats.
n.train <- 200
train <- sample(seq_len(nrow(Carseats)), n.train)

## Test set: the remaining observations and their true class labels.
Carseats.test <- Carseats[-train, ]
High.test <- High[-train]

### Fitting the classification tree on the training set ###
## 'Sales' is excluded from the predictors because 'High' is derived
## directly from it.
tree.carseats <- tree(High ~ . - Sales, data = Carseats, subset = train)
summary(tree.carseats)
plot(tree.carseats)
text(tree.carseats, pretty = 0)

## In the original author's run, shelve location, age and price were the
## three most important indicators of high sales. Confirm against the
## summary() output and plot above, since the fitted tree depends on the
## random split.

### Prediction of the test data by 'tree.carseats' ###
tree.pred <- predict(tree.carseats, Carseats.test, type = "class")

### Comparing the prediction with actual class labels ###
table(tree.pred, High.test)

### Compute the accuracy rate ###
## Share of test observations whose predicted class equals the true one.
mean(tree.pred == High.test)

### Performing cross-validation and pruning to improve prediction
### quality. Both processes are guided by classification error.
set.seed(456)
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
cv.carseats
cv.carseats$size  # 'size' gives the number of terminal nodes
cv.carseats$dev   # 'dev' holds the cross-validation error for each size
                  # (the original run reported a lowest value of 62)

## Choose the pruned size from the CV results instead of hard-coding the
## value seen in one particular run (the original run selected 10
## terminal nodes). Among sizes tied for the lowest CV error the smallest
## tree is preferred; the one-leaf tree is skipped because it has no
## splits to plot or interpret.
has.splits <- cv.carseats$size > 1
lowest.dev <- min(cv.carseats$dev[has.splits])
best.size <- min(cv.carseats$size[has.splits & cv.carseats$dev == lowest.dev])
best.size

## Pruning the tree
prune.carseats <- prune.misclass(tree.carseats, best = best.size)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

## Check to see if pruning improved results
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
mean(tree.pred == High.test)